Age of required CRAN packages



In [5]:

    
import pandas
from matplotlib import pyplot as plt
from matplotlib_venn import venn3, venn2

%matplotlib inline
from IPython.display import set_matplotlib_formats
#set_matplotlib_formats('pdf')

# Load the two input snapshots. DataFrame.from_csv is deprecated in favour of
# pandas.read_csv; from_csv defaulted to parse_dates=True, so that argument is
# passed explicitly to keep the call equivalent.
cran_release = pandas.read_csv('../data/cran-packages-150601.csv', index_col=None, parse_dates=True)
data = pandas.read_csv('../data/github-cran-bioc-alldata-150420.csv', index_col=None, parse_dates=True)



In [6]:

    
# Names that are stripped from every dependency list further down: "R" itself
# plus packages that ship with a standard R installation (presumably the base
# and recommended sets -- confirm against the R version under study).
R_packages = """
    R MASS Matrix base boot class cluster codetools compiler datasets foreign grDevices
    graphics grid lattice methods mgcv nlme nnet parallel rpart
    spatial splines stats stats4 survival tcltk tools translations utils
""".split()



In [7]:

    
# Reduce the release table to one row per package: after an ascending sort on
# 'mtime', the first row kept for each package is the one with the smallest
# mtime. The key column is renamed to 'Package' so it can be merged with `data`.
# NOTE(review): DataFrame.sort and the take_last= keyword are the spellings of
# the pandas release this notebook was written for; newer pandas names them
# sort_values and keep=.
cran_release = (
    cran_release
    .sort('mtime')
    .drop_duplicates('package', take_last=False)
    .rename(columns={'package': 'Package'})
    [['Package', 'mtime']]
)



In [8]:

    
# Keep only CRAN and GitHub rows; after an ascending sort on 'Date', keep the
# last row for each (Package, Source) pair, i.e. the one with the greatest Date.
# Missing values become '' so the dependency strings can be split later.
# NOTE(review): DataFrame.sort and the take_last= keyword are the spellings of
# the pandas release this notebook was written for; newer pandas names them
# sort_values and keep=.
data = (
    data
    .query('Source == "cran" or Source == "github"')
    .sort('Date')
    .drop_duplicates(['Package', 'Source'], take_last=True)
    [['Package', 'Version', 'Source', 'Date', 'Depends', 'Imports']]
    .fillna('')
)



In [9]:

    
# packages[name][source] -> list of dependency names (Depends + Imports) for
# that package as seen on that source, with the R_packages entries removed.
packages = {}

for idx, row in data.iterrows():
    raw_tokens = row['Depends'].split(' ') + row['Imports'].split(' ')
    stripped = (token.strip() for token in raw_tokens)
    deps = [token for token in stripped if len(token) > 0]
    per_source = packages.setdefault(row['Package'], {})
    per_source[row['Source']] = [dep for dep in deps if dep not in R_packages]



In [10]:

    
# For each source, collect every dependency that is itself a package known on
# CRAN (i.e. has a 'cran' entry in `packages`).
cran_required = {'github': set(), 'cran': set()}

# .values() / .items() instead of .iteritems(): the latter exists only on
# Python 2 dicts, while these work on both Python 2 and 3. The package name
# was never used, so only the per-source mappings are iterated.
for package in packages.values():
    for source, deps in package.items():
        for dep in deps:
            if packages.get(dep, {}).get('cran') is not None:
                cran_required[source].add(dep)



In [11]:

    
# Overlap between the CRAN packages required from GitHub and those required from CRAN.
venn2(
    subsets=(cran_required['github'], cran_required['cran']),
    set_labels=('github', 'cran'),
)









    Out[11]:





<matplotlib_venn._common.VennDiagram instance at 0x7fa27ce9de60>



In [12]:

    
# One row per CRAN package, indexed by name, carrying its 'Date' from `data`
# and its 'mtime' from the release table. The left join keeps packages that
# have no match in `cran_release`.
required = (
    data.query('Source == "cran"')
    .loc[:, ['Package', 'Date']]
    .merge(cran_release, how='left', on='Package')
    .set_index('Package')
)



In [13]:

    
# Add the five flag columns, all NaN to start with. float('nan') replaces the
# pandas.np alias, which is deprecated and absent from newer pandas releases.
# The tuple order matches the column order produced by the original chained
# assignment.
for column in ('GitHub', 'CRAN', 'GitHubOnly', 'CRANOnly', 'Both'):
    required[column] = float('nan')



In [14]:

    
# Mark with 1 every package required by at least one GitHub package
# ('GitHub' column) or by at least one CRAN package ('CRAN' column).
for column, source in (('GitHub', 'github'), ('CRAN', 'cran')):
    for name in cran_required[source]:
        required.loc[name, column] = 1



In [15]:

    
# Preview the first ten rows.
required.head(10)









    Out[15]:






  
    
      
      Date
      mtime
      GitHub
      CRAN
      GitHubOnly
      CRANOnly
      Both
    
    
      Package
      
      
      
      
      
      
      
    
  
  
    
      A3
      2013-09-03 00:00:00
      2013-02-07 10:00:29
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      ABCExtremes
      2013-09-03 00:00:00
      2013-05-15 10:45:56
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      ABCp2
      2013-09-03 00:00:00
      2013-04-10 17:04:22
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      ACCLMA
      2013-09-03 00:00:00
      2012-10-29 13:13:35
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      ADGofTest
      2013-09-03 00:00:00
      2009-07-18 17:21:36
      1
      1
      NaN
      NaN
      NaN
    
    
      AIM
      2013-09-03 00:00:00
      2010-04-05 21:01:23
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      ALS
      2013-09-03 00:00:00
      2008-08-06 19:06:23
      1
      NaN
      NaN
      NaN
      NaN
    
    
      AMAP.Seq
      2013-09-03 00:00:00
      2012-06-19 16:55:48
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      AMGET
      2013-09-03 00:00:00
      2013-08-02 14:19:55
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      ANN
      2013-09-03 00:00:00
      2011-10-20 10:46:23
      NaN
      NaN
      NaN
      NaN
      NaN



In [16]:

    
def __F(row):
    """Derive the mutually exclusive 'Both' / 'GitHubOnly' / 'CRANOnly' flags.

    Parameters
    ----------
    row : mapping with 'GitHub', 'CRAN', 'GitHubOnly', 'CRANOnly' and 'Both'
        keys and a ``copy()`` method (a pandas Series when used through
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    A copy of ``row`` in which exactly one of 'Both', 'GitHubOnly' or
    'CRANOnly' is set to 1 when 'GitHub' and/or 'CRAN' equals 1; otherwise the
    copy is returned unchanged.
    """
    # Work on a copy: mutating the object handed in by DataFrame.apply is not
    # supported by pandas and can leak changes back into the source frame.
    row = row.copy()
    in_github = row['GitHub'] == 1
    in_cran = row['CRAN'] == 1
    if in_github and in_cran:
        row['Both'] = 1
    elif in_github:
        row['GitHubOnly'] = 1
    elif in_cran:
        row['CRANOnly'] = 1
    return row



In [17]:

    
fields = ['GitHub', 'CRAN', 'GitHubOnly', 'CRANOnly', 'Both']

# Classify every package, then index the flags by 'mtime' so they can be
# accumulated over time.
d = required.apply(__F, axis=1)
# Explicit copy: assigning a column into a column-sliced frame would otherwise
# trigger SettingWithCopyWarning.
d = d[['mtime'] + fields].copy()
d['mtime'] = pandas.to_datetime(d['mtime'])
d = d.set_index('mtime').sort_index()

# Running count per category. cumsum leaves NaN where a row carries no flag,
# so forward-fill to get a continuous curve (.ffill() is the non-deprecated
# spelling of fillna(method='pad')).
d.cumsum().ffill().plot(figsize=(15,6))









    Out[17]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fa26fe77e10>



In [18]:

    
# Age of each package in whole days, measured from its 'mtime' (the index of
# `d`) to the reference date 2015-06-01.
# NOTE(review): this presumably matches the cran-packages-150601.csv snapshot -- confirm.
# TimedeltaIndex.days replaces astype('timedelta64[D]'), which newer pandas
# rejects as an unsupported resolution; both give the floored number of days.
reference_date = pandas.to_datetime('2015-06-01')
d['days'] = (reference_date - d.index).days

# '<field>D' holds the age only for rows flagged in <field> (flag is 1),
# and NaN everywhere else (flag is NaN).
for field in fields:
    d['{}D'.format(field)] = d['days'] * d[field]



In [19]:

    
# Box plot of the age distribution (in days) for each category.
age_columns = ['{}D'.format(field) for field in fields]
d[age_columns].plot(kind='box')









    Out[19]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fa26fe8ca10>



In [20]:

    
from scipy.stats import mannwhitneyu
from scipy.stats import norm
from math import sqrt

# Pairwise Mann-Whitney U comparison of the age distributions.
# Ud[a][b]: U statistic, zd[a][b]: normal-approximation z score (no tie
# correction), pd[a][b]: two-sided p-value derived from z.
Ud = {}
zd = {}
pd = {}

for field1 in fields:
    d1 = d[field1 + 'D'].dropna()
    n1 = len(d1)
    for field2 in fields:
        d2 = d[field2 + 'D'].dropna()
        n2 = len(d2)
        # Only the statistic is used; the p-value is recomputed below.
        u = mannwhitneyu(d1, d2, use_continuity=False)[0]
        # Older SciPy returns min(U1, U2); newer releases return U1, the
        # statistic of the first sample. Taking the smaller one explicitly
        # keeps z <= 0, so 2 * cdf(z) stays a valid two-sided p-value (<= 1)
        # on either version.
        u = min(u, n1 * n2 - u)
        mean_u = (n1 * n2) / 2.0
        sd_u = sqrt(n1 * n2 * (n1 + n2 + 1) / 12.0)
        z = (u - mean_u) / sd_u
        Ud.setdefault(field1, {})[field2] = u
        zd.setdefault(field1, {})[field2] = z
        pd.setdefault(field1, {})[field2] = 2 * norm.cdf(z)



In [21]:

    
# U statistics as a table, rows and columns in alphabetical order.
u_table = pandas.DataFrame.from_dict(Ud, orient='index')
u_table.sort_index(axis=0).sort_index(axis=1)

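Caution: each p-value must be judged against a global significance level $\alpha$. The matrix is symmetric, so only its lower triangle is examined, which gives $\binom{5}{2} = 10$ distinct pairwise comparisons. Applying a Bonferroni correction for a global $\alpha = 0.05$, each p-value has to be multiplied by 10 before being compared to $\alpha$ (equivalently, compare the raw p-value to $\alpha / 10 = 0.005$).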



In [22]:

    
# Two-sided p-values as a table, rows and columns in alphabetical order.
p_table = pandas.DataFrame.from_dict(pd, orient='index')
p_table.sort_index(axis=0).sort_index(axis=1)









    Out[22]:






  
    
      
      Both
      CRAN
      CRANOnly
      GitHub
      GitHubOnly
    
  
  
    
      Both
      1.000000e+00
      9.171233e-04
      2.490240e-10
      1.010140e-07
      6.865672e-39
    
    
      CRAN
      9.171233e-04
      1.000000e+00
      6.542091e-05
      5.770613e-03
      4.152996e-34
    
    
      CRANOnly
      2.490240e-10
      6.542091e-05
      1.000000e+00
      1.704979e-01
      1.035604e-18
    
    
      GitHub
      1.010140e-07
      5.770613e-03
      1.704979e-01
      1.000000e+00
      2.370454e-22
    
    
      GitHubOnly
      6.865672e-39
      4.152996e-34
      1.035604e-18
      2.370454e-22
      1.000000e+00



In [23]:

    
# z scores as a table, rows and columns in alphabetical order.
z_table = pandas.DataFrame.from_dict(zd, orient='index')
z_table.sort_index(axis=0).sort_index(axis=1)

	Both	CRAN	CRANOnly	GitHub	GitHubOnly
Both	425964.5	689044.0	263079.5	517238.0	91273.5
CRAN	689044.0	1313820.5	506681.5	984911.5	177772.5
CRANOnly	263079.5	506681.5	243602.0	434142.5	86499.0
GitHub	517238.0	984911.5	434142.5	834632.0	159354.0
GitHubOnly	91273.5	177772.5	86499.0	159354.0	68080.5

	Both	CRAN	CRANOnly	GitHub	GitHubOnly
Both	0.000000	-3.314789	-6.327587	-5.324890	-13.044107
CRAN	-3.314789	0.000000	-3.992353	-2.760539	-12.176382
CRANOnly	-6.327587	-3.992353	0.000000	-1.370606	-8.831198
GitHub	-5.324890	-2.760539	-1.370606	0.000000	-9.724509
GitHubOnly	-13.044107	-12.176382	-8.831198	-9.724509	0.000000

	Date	mtime	GitHub	CRAN	GitHubOnly	CRANOnly	Both
Package
A3	2013-09-03 00:00:00	2013-02-07 10:00:29	NaN	NaN	NaN	NaN	NaN
ABCExtremes	2013-09-03 00:00:00	2013-05-15 10:45:56	NaN	NaN	NaN	NaN	NaN
ABCp2	2013-09-03 00:00:00	2013-04-10 17:04:22	NaN	NaN	NaN	NaN	NaN
ACCLMA	2013-09-03 00:00:00	2012-10-29 13:13:35	NaN	NaN	NaN	NaN	NaN
ADGofTest	2013-09-03 00:00:00	2009-07-18 17:21:36	1	1	NaN	NaN	NaN
AIM	2013-09-03 00:00:00	2010-04-05 21:01:23	NaN	NaN	NaN	NaN	NaN
ALS	2013-09-03 00:00:00	2008-08-06 19:06:23	1	NaN	NaN	NaN	NaN
AMAP.Seq	2013-09-03 00:00:00	2012-06-19 16:55:48	NaN	NaN	NaN	NaN	NaN
AMGET	2013-09-03 00:00:00	2013-08-02 14:19:55	NaN	NaN	NaN	NaN	NaN
ANN	2013-09-03 00:00:00	2011-10-20 10:46:23	NaN	NaN	NaN	NaN	NaN

	Both	CRAN	CRANOnly	GitHub	GitHubOnly
Both	1.000000e+00	9.171233e-04	2.490240e-10	1.010140e-07	6.865672e-39
CRAN	9.171233e-04	1.000000e+00	6.542091e-05	5.770613e-03	4.152996e-34
CRANOnly	2.490240e-10	6.542091e-05	1.000000e+00	1.704979e-01	1.035604e-18
GitHub	1.010140e-07	5.770613e-03	1.704979e-01	1.000000e+00	2.370454e-22
GitHubOnly	6.865672e-39	4.152996e-34	1.035604e-18	2.370454e-22	1.000000e+00